# import packages
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from pyGRNN import GRNN
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import tree
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
from sklearn.metrics import plot_confusion_matrix
from matplotlib.pyplot import figure
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings
warnings.filterwarnings(action='ignore')
# precision-recall curve and f1
# load dataset
df = pd.read_csv("dataset.csv")
# data dimension
df.shape
(91713, 85)
# df head
df.head()
| encounter_id | patient_id | hospital_id | age | bmi | elective_surgery | ethnicity | gender | height | icu_admit_source | ... | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | apache_3j_bodysystem | apache_2_bodysystem | Unnamed: 83 | hospital_death | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 66154 | 25312 | 118 | 68.0 | 22.73 | 0 | Caucasian | M | 180.3 | Floor | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sepsis | Cardiovascular | NaN | 0 |
| 1 | 114252 | 59342 | 81 | 77.0 | 27.42 | 0 | Caucasian | F | 160.0 | Floor | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Respiratory | Respiratory | NaN | 0 |
| 2 | 119783 | 50777 | 118 | 25.0 | 31.95 | 0 | Caucasian | F | 172.7 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Metabolic | Metabolic | NaN | 0 |
| 3 | 79267 | 46918 | 118 | 81.0 | 22.64 | 1 | Caucasian | F | 165.1 | Operating Room / Recovery | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | 0 |
| 4 | 92056 | 34377 | 33 | 19.0 | NaN | 0 | Caucasian | M | 188.0 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Trauma | Trauma | NaN | 0 |
5 rows × 85 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 91713 entries, 0 to 91712 Data columns (total 85 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 encounter_id 91713 non-null int64 1 patient_id 91713 non-null int64 2 hospital_id 91713 non-null int64 3 age 87485 non-null float64 4 bmi 88284 non-null float64 5 elective_surgery 91713 non-null int64 6 ethnicity 90318 non-null object 7 gender 91688 non-null object 8 height 90379 non-null float64 9 icu_admit_source 91601 non-null object 10 icu_id 91713 non-null int64 11 icu_stay_type 91713 non-null object 12 icu_type 91713 non-null object 13 pre_icu_los_days 91713 non-null float64 14 weight 88993 non-null float64 15 apache_2_diagnosis 90051 non-null float64 16 apache_3j_diagnosis 90612 non-null float64 17 apache_post_operative 91713 non-null int64 18 arf_apache 90998 non-null float64 19 gcs_eyes_apache 89812 non-null float64 20 gcs_motor_apache 89812 non-null float64 21 gcs_unable_apache 90676 non-null float64 22 gcs_verbal_apache 89812 non-null float64 23 heart_rate_apache 90835 non-null float64 24 intubated_apache 90998 non-null float64 25 map_apache 90719 non-null float64 26 resprate_apache 90479 non-null float64 27 temp_apache 87605 non-null float64 28 ventilated_apache 90998 non-null float64 29 d1_diasbp_max 91548 non-null float64 30 d1_diasbp_min 91548 non-null float64 31 d1_diasbp_noninvasive_max 90673 non-null float64 32 d1_diasbp_noninvasive_min 90673 non-null float64 33 d1_heartrate_max 91568 non-null float64 34 d1_heartrate_min 91568 non-null float64 35 d1_mbp_max 91493 non-null float64 36 d1_mbp_min 91493 non-null float64 37 d1_mbp_noninvasive_max 90234 non-null float64 38 d1_mbp_noninvasive_min 90234 non-null float64 39 d1_resprate_max 91328 non-null float64 40 d1_resprate_min 91328 non-null float64 41 d1_spo2_max 91380 non-null float64 42 d1_spo2_min 91380 non-null float64 43 d1_sysbp_max 91554 non-null float64 44 d1_sysbp_min 91554 non-null float64 45 d1_sysbp_noninvasive_max 
90686 non-null float64 46 d1_sysbp_noninvasive_min 90686 non-null float64 47 d1_temp_max 89389 non-null float64 48 d1_temp_min 89389 non-null float64 49 h1_diasbp_max 88094 non-null float64 50 h1_diasbp_min 88094 non-null float64 51 h1_diasbp_noninvasive_max 84363 non-null float64 52 h1_diasbp_noninvasive_min 84363 non-null float64 53 h1_heartrate_max 88923 non-null float64 54 h1_heartrate_min 88923 non-null float64 55 h1_mbp_max 87074 non-null float64 56 h1_mbp_min 87074 non-null float64 57 h1_mbp_noninvasive_max 82629 non-null float64 58 h1_mbp_noninvasive_min 82629 non-null float64 59 h1_resprate_max 87356 non-null float64 60 h1_resprate_min 87356 non-null float64 61 h1_spo2_max 87528 non-null float64 62 h1_spo2_min 87528 non-null float64 63 h1_sysbp_max 88102 non-null float64 64 h1_sysbp_min 88102 non-null float64 65 h1_sysbp_noninvasive_max 84372 non-null float64 66 h1_sysbp_noninvasive_min 84372 non-null float64 67 d1_glucose_max 85906 non-null float64 68 d1_glucose_min 85906 non-null float64 69 d1_potassium_max 82128 non-null float64 70 d1_potassium_min 82128 non-null float64 71 apache_4a_hospital_death_prob 83766 non-null float64 72 apache_4a_icu_death_prob 83766 non-null float64 73 aids 90998 non-null float64 74 cirrhosis 90998 non-null float64 75 diabetes_mellitus 90998 non-null float64 76 hepatic_failure 90998 non-null float64 77 immunosuppression 90998 non-null float64 78 leukemia 90998 non-null float64 79 lymphoma 90998 non-null float64 80 solid_tumor_with_metastasis 90998 non-null float64 81 apache_3j_bodysystem 90051 non-null object 82 apache_2_bodysystem 90051 non-null object 83 Unnamed: 83 0 non-null float64 84 hospital_death 91713 non-null int64 dtypes: float64(71), int64(7), object(7) memory usage: 59.5+ MB
# count missing values by column
dict(df.isna().sum())
{'encounter_id': 0,
'patient_id': 0,
'hospital_id': 0,
'age': 4228,
'bmi': 3429,
'elective_surgery': 0,
'ethnicity': 1395,
'gender': 25,
'height': 1334,
'icu_admit_source': 112,
'icu_id': 0,
'icu_stay_type': 0,
'icu_type': 0,
'pre_icu_los_days': 0,
'weight': 2720,
'apache_2_diagnosis': 1662,
'apache_3j_diagnosis': 1101,
'apache_post_operative': 0,
'arf_apache': 715,
'gcs_eyes_apache': 1901,
'gcs_motor_apache': 1901,
'gcs_unable_apache': 1037,
'gcs_verbal_apache': 1901,
'heart_rate_apache': 878,
'intubated_apache': 715,
'map_apache': 994,
'resprate_apache': 1234,
'temp_apache': 4108,
'ventilated_apache': 715,
'd1_diasbp_max': 165,
'd1_diasbp_min': 165,
'd1_diasbp_noninvasive_max': 1040,
'd1_diasbp_noninvasive_min': 1040,
'd1_heartrate_max': 145,
'd1_heartrate_min': 145,
'd1_mbp_max': 220,
'd1_mbp_min': 220,
'd1_mbp_noninvasive_max': 1479,
'd1_mbp_noninvasive_min': 1479,
'd1_resprate_max': 385,
'd1_resprate_min': 385,
'd1_spo2_max': 333,
'd1_spo2_min': 333,
'd1_sysbp_max': 159,
'd1_sysbp_min': 159,
'd1_sysbp_noninvasive_max': 1027,
'd1_sysbp_noninvasive_min': 1027,
'd1_temp_max': 2324,
'd1_temp_min': 2324,
'h1_diasbp_max': 3619,
'h1_diasbp_min': 3619,
'h1_diasbp_noninvasive_max': 7350,
'h1_diasbp_noninvasive_min': 7350,
'h1_heartrate_max': 2790,
'h1_heartrate_min': 2790,
'h1_mbp_max': 4639,
'h1_mbp_min': 4639,
'h1_mbp_noninvasive_max': 9084,
'h1_mbp_noninvasive_min': 9084,
'h1_resprate_max': 4357,
'h1_resprate_min': 4357,
'h1_spo2_max': 4185,
'h1_spo2_min': 4185,
'h1_sysbp_max': 3611,
'h1_sysbp_min': 3611,
'h1_sysbp_noninvasive_max': 7341,
'h1_sysbp_noninvasive_min': 7341,
'd1_glucose_max': 5807,
'd1_glucose_min': 5807,
'd1_potassium_max': 9585,
'd1_potassium_min': 9585,
'apache_4a_hospital_death_prob': 7947,
'apache_4a_icu_death_prob': 7947,
'aids': 715,
'cirrhosis': 715,
'diabetes_mellitus': 715,
'hepatic_failure': 715,
'immunosuppression': 715,
'leukemia': 715,
'lymphoma': 715,
'solid_tumor_with_metastasis': 715,
'apache_3j_bodysystem': 1662,
'apache_2_bodysystem': 1662,
'Unnamed: 83': 91713,
'hospital_death': 0}
# check percentage of missing values by column
dict(round(df.isna().sum()/ len(df),2))
{'encounter_id': 0.0,
'patient_id': 0.0,
'hospital_id': 0.0,
'age': 0.05,
'bmi': 0.04,
'elective_surgery': 0.0,
'ethnicity': 0.02,
'gender': 0.0,
'height': 0.01,
'icu_admit_source': 0.0,
'icu_id': 0.0,
'icu_stay_type': 0.0,
'icu_type': 0.0,
'pre_icu_los_days': 0.0,
'weight': 0.03,
'apache_2_diagnosis': 0.02,
'apache_3j_diagnosis': 0.01,
'apache_post_operative': 0.0,
'arf_apache': 0.01,
'gcs_eyes_apache': 0.02,
'gcs_motor_apache': 0.02,
'gcs_unable_apache': 0.01,
'gcs_verbal_apache': 0.02,
'heart_rate_apache': 0.01,
'intubated_apache': 0.01,
'map_apache': 0.01,
'resprate_apache': 0.01,
'temp_apache': 0.04,
'ventilated_apache': 0.01,
'd1_diasbp_max': 0.0,
'd1_diasbp_min': 0.0,
'd1_diasbp_noninvasive_max': 0.01,
'd1_diasbp_noninvasive_min': 0.01,
'd1_heartrate_max': 0.0,
'd1_heartrate_min': 0.0,
'd1_mbp_max': 0.0,
'd1_mbp_min': 0.0,
'd1_mbp_noninvasive_max': 0.02,
'd1_mbp_noninvasive_min': 0.02,
'd1_resprate_max': 0.0,
'd1_resprate_min': 0.0,
'd1_spo2_max': 0.0,
'd1_spo2_min': 0.0,
'd1_sysbp_max': 0.0,
'd1_sysbp_min': 0.0,
'd1_sysbp_noninvasive_max': 0.01,
'd1_sysbp_noninvasive_min': 0.01,
'd1_temp_max': 0.03,
'd1_temp_min': 0.03,
'h1_diasbp_max': 0.04,
'h1_diasbp_min': 0.04,
'h1_diasbp_noninvasive_max': 0.08,
'h1_diasbp_noninvasive_min': 0.08,
'h1_heartrate_max': 0.03,
'h1_heartrate_min': 0.03,
'h1_mbp_max': 0.05,
'h1_mbp_min': 0.05,
'h1_mbp_noninvasive_max': 0.1,
'h1_mbp_noninvasive_min': 0.1,
'h1_resprate_max': 0.05,
'h1_resprate_min': 0.05,
'h1_spo2_max': 0.05,
'h1_spo2_min': 0.05,
'h1_sysbp_max': 0.04,
'h1_sysbp_min': 0.04,
'h1_sysbp_noninvasive_max': 0.08,
'h1_sysbp_noninvasive_min': 0.08,
'd1_glucose_max': 0.06,
'd1_glucose_min': 0.06,
'd1_potassium_max': 0.1,
'd1_potassium_min': 0.1,
'apache_4a_hospital_death_prob': 0.09,
'apache_4a_icu_death_prob': 0.09,
'aids': 0.01,
'cirrhosis': 0.01,
'diabetes_mellitus': 0.01,
'hepatic_failure': 0.01,
'immunosuppression': 0.01,
'leukemia': 0.01,
'lymphoma': 0.01,
'solid_tumor_with_metastasis': 0.01,
'apache_3j_bodysystem': 0.02,
'apache_2_bodysystem': 0.02,
'Unnamed: 83': 1.0,
'hospital_death': 0.0}
# number of columns containing at least one null value
d = dict(df.isna().sum())
# idiomatic rewrite of the original append loop: keep names of columns
# whose null count is non-zero (truthy)
L = [key for key, value in d.items() if value]
len(L)
75
# sum of total missing
df.isna().sum().sum()
288046
For numeric columns, we impute missing values with interpolation.
#Heatmap displaying missing values
plt.figure(figsize=(12,6))
sns.heatmap(df.isnull())
<AxesSubplot:>
# drop the empty "Unnamed: 83" column
df = df.drop("Unnamed: 83",axis=1)
# identify numeric columns
s = (df.dtypes =="float64")
object_cols = list(s[s].index)
# define function to interpolate value in numeric columns
def fillin(df, columns=None):
    """Impute missing values in numeric columns via linear interpolation.

    Args:
        df: pandas DataFrame to impute (modified in place and returned).
        columns: optional iterable of column names to interpolate.
            Defaults to all float64 columns of ``df``, which removes the
            previous hidden dependency on the module-level ``object_cols``
            list while preserving behavior for existing callers.

    Returns:
        The same DataFrame with numeric gaps interpolated. Note that
        ``Series.interpolate`` does not fill leading NaNs; those rows are
        removed later by ``dropna`` in the pipeline.
    """
    if columns is None:
        columns = df.select_dtypes(include="float64").columns
    for col in columns:
        # Skip fully populated columns to avoid needless work.
        if df[col].isna().any():
            df[col] = df[col].interpolate()
    return df
# impute missing with interpolation
df = fillin(df)
# value count gender
df.gender.value_counts()
M 49469 F 42219 Name: gender, dtype: int64
# fill missing value in gender column with M
df['gender'] = df['gender'].fillna('M')
df = df.dropna()
# count missing after cleaning
df.isna().sum().sum()
0
# shape of cleaned data
df.shape
(88589, 84)
#Check missing value for current data
plt.figure(figsize=(12,6))
sns.heatmap(df.isnull())
<AxesSubplot:>
#Prepare for data profile
def with_hue(plot, feature, Number_of_categories, hue_categories):
    """Annotate a hued seaborn countplot with per-category percentages.

    Args:
        plot: the Axes returned by ``sns.countplot`` (with ``hue=``).
        feature: the Series plotted on the x axis (used for category totals).
        Number_of_categories: number of distinct x categories.
        hue_categories: number of distinct hue levels.

    Each bar is labelled with its height as a percentage of that x
    category's total count, then the figure is shown.
    """
    heights = [p.get_height() for p in plot.patches]
    patches = list(plot.patches)
    for i in range(Number_of_categories):
        total = feature.value_counts().values[i]
        for j in range(hue_categories):
            # Bars are laid out hue-major: bar index = j*Number_of_categories + i.
            idx = j * Number_of_categories + i
            percentage = '{:.1f}%'.format(100 * heights[idx] / total)
            x = patches[idx].get_x() + patches[idx].get_width() / 2 - 0.15
            y = patches[idx].get_y() + patches[idx].get_height()
            # Bug fix: annotate on the Axes passed in as ``plot`` rather than
            # relying on a module-level global ``ax`` that happens to exist.
            plot.annotate(percentage, (x, y), size=12)
    plt.show()
def without_hue(plot, feature):
    """Annotate a plain (un-hued) countplot with percentage labels.

    Args:
        plot: the Axes returned by ``sns.countplot``.
        feature: the Series that was plotted (its length is the total count).
    """
    total = len(feature)
    # Bug fix: iterate and annotate the Axes passed in as ``plot`` instead of
    # the module-level global ``ax`` the original silently depended on.
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        plot.annotate(percentage, (x, y), size=12)
    plt.show()
#Plot for response
plt.figure(figsize=(7,5))
ax=sns.countplot('hospital_death', data=df)
plt.xticks(size=12)
plt.xlabel('hospital_death', size=12)
plt.yticks(size=12)
plt.ylabel('count',size=12)
without_hue(ax, df.hospital_death)
# Density Plot of each variable
unpivot = pd.melt(df, df.describe().columns[0], df.describe().columns[1:])
unpivot
| encounter_id | variable | value | |
|---|---|---|---|
| 0 | 66154 | patient_id | 25312.0 |
| 1 | 114252 | patient_id | 59342.0 |
| 2 | 119783 | patient_id | 50777.0 |
| 3 | 79267 | patient_id | 46918.0 |
| 4 | 92056 | patient_id | 34377.0 |
| ... | ... | ... | ... |
| 6732759 | 91592 | hospital_death | 0.0 |
| 6732760 | 66119 | hospital_death | 0.0 |
| 6732761 | 8981 | hospital_death | 0.0 |
| 6732762 | 33776 | hospital_death | 0.0 |
| 6732763 | 1671 | hospital_death | 0.0 |
6732764 rows × 3 columns
# density plot for each variable
fig2 = sns.FacetGrid(unpivot, col = 'variable', col_wrap = 3,
sharex = False, sharey = False)
fig2.map(sns.kdeplot, "value")
plt.show()
# plot of hospital_death
f,ax=plt.subplots(1,2,figsize=(18,8))
df['hospital_death'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('hospital_death')
sns.countplot('hospital_death',data=df,ax=ax[1])
ax[1].set_title('hospital_death')
plt.show()
# Percentage of patients who died, by gender.
# Rewritten with label-based indexing: the original mixed the alphabetical
# order of ``groupby`` ([0]='F', [1]='M') with the frequency order of
# ``value_counts`` ([0]='M', [1]='F'), which only produced correct ratios by
# coincidence. ``mean()`` of the 0/1 target equals deaths/count directly.
death_rate = 100 * df.groupby("gender")['hospital_death'].mean()
# create a sub data frame for plotting; values are in percent, matching the
# "percentage of people(%)" axis label used in the bar plot below
df_gender = pd.DataFrame(
    {'female': death_rate['F'], 'male': death_rate['M']}, index=[0]
)
# plot gender by hospital_death
plt.figure(figsize=(5,5))
sns.barplot(data=df_gender).set(xlabel='Gender', ylabel='percentage of people(%)', title = "hospital_death by gender")
plt.show()
age_death_F=df[df['gender']=='F'][['age','hospital_death']].groupby('age').mean().reset_index()
age_death_M=df[df['gender']=='M'][['age','hospital_death']].groupby('age').mean().reset_index()
fig = make_subplots()
fig.add_trace(
go.Scatter(x=age_death_F['age'], y=age_death_F['hospital_death'], name="Female patients"))
fig.add_trace(
go.Scatter(x=age_death_M['age'], y=age_death_M['hospital_death'],name="Male patients"))
fig.update_layout(
title_text="<b>Average hospital death probability of patients<b>")
fig.update_xaxes(title_text="<b>patient age<b>")
fig.update_yaxes(title_text="<b>Average Hospital Death</b>", secondary_y=False)
fig.show()
# plot age distribution for hospital_death according to gender
import plotly.express as px
fig = px.histogram(df[['age', 'gender', 'hospital_death']].dropna(), x= "age", y = "hospital_death", color = 'gender',
hover_data = df[['age', 'gender', 'hospital_death']].columns)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()
# plot apache_2_diagnosis result for hospital_death
plt.figure(figsize=(8,5))
sns.histplot(data = df, x = "apache_2_diagnosis", kde = True, hue = "hospital_death")
plt.xlabel('apache_2_diagnosis')
plt.title("Count of hospital death by count apache 2 diagnosis ")
Text(0.5, 1.0, 'Count of hospital death by count apache 2 diagnosis ')
# plot death for ICU type
sns.factorplot('icu_type','hospital_death',data=df, aspect=1, size = 8)
plt.title("Death rate by ICU Type")
plt.show()
# plot bar chart apache_2_diagnosis result for hospital_death
plt.figure(figsize=(12,6))
sns.countplot('icu_type',hue='hospital_death',data=df)
plt.show()
#prepare plot for elective surgery
def barPerc(df, xVar, ax):
    """Label every hue bar of a count/bar plot with its share of the
    corresponding x-category total.

    args:
        df: pandas dataframe
        xVar: (string) X variable
        ax: Axes object (for Seaborn Countplot/Bar plot or
            pandas bar plot)
    """
    # Count the x categories, filtering NaN (NaN != NaN is the NaN test).
    numX = sum(1 for value in df[xVar].unique() if value == value)
    bars = ax.patches
    # Bars are created in hue order: with numX categories, every numX-th
    # patch starting at index ind belongs to the ind-th x category
    # (e.g. 8 categories, 4 hues -> patches 0, 8, 16, 24 share category 0).
    for ind in range(numX):
        hueBars = bars[ind::numX]
        total = sum(bar.get_height() for bar in hueBars)
        # Write the percentage just above each bar, centered horizontally.
        for bar in hueBars:
            ax.text(
                bar.get_x() + bar.get_width() / 2.0,
                bar.get_height(),
                f'{bar.get_height()/total:.0%}',
                ha="center",
                va="bottom",
            )
#prepare plot for elective surgery
elective_surgery_0 = df[df['elective_surgery']==0]
elective_surgery_1 = df[df['elective_surgery']==1]
# plot elective surgery to compare hospital_death for patient hospital life because of
# elective surgery and those are not
f,ax=plt.subplots(1,3,figsize=(18,8))
a = df['elective_surgery'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
b = sns.countplot(x= 'elective_surgery',hue="hospital_death",ax=ax[1],data=elective_surgery_0)
c = sns.countplot(x= 'elective_surgery',hue="hospital_death",ax=ax[2],data=elective_surgery_1)
barPerc(elective_surgery_0,'elective_surgery',b)
barPerc(elective_surgery_1,'elective_surgery',c)
# Similar plot for variable intubated_apache: pie of overall distribution,
# plus death counts within each intubation group.
intubated_apache_0 = df[df['intubated_apache']==0]
intubated_apache_1 = df[df['intubated_apache']==1]
f,ax=plt.subplots(1,3,figsize=(18,8))
a = df['intubated_apache'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
b = sns.countplot(x= 'intubated_apache',hue="hospital_death",ax=ax[1],data=intubated_apache_0)
c = sns.countplot(x= 'intubated_apache',hue="hospital_death",ax=ax[2],data=intubated_apache_1)
# Bug fix: annotate with the intubated_apache subsets and column — the
# original passed the elective_surgery arguments copy-pasted from the
# previous cell, so the printed percentages described the wrong variable.
barPerc(intubated_apache_0,'intubated_apache',b)
barPerc(intubated_apache_1,'intubated_apache',c)
# correlation matrix to check collinearity
corrMatrix = df.corr()
corrMatrix
| encounter_id | patient_id | hospital_id | age | bmi | elective_surgery | height | icu_id | pre_icu_los_days | weight | ... | apache_4a_icu_death_prob | aids | cirrhosis | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | hospital_death | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| encounter_id | 1.000000 | -0.008796 | -0.004100 | -0.004463 | -0.000358 | 0.000520 | -0.005201 | -0.001061 | 0.000809 | -0.003408 | ... | 0.000091 | 0.002607 | 0.008076 | 0.003607 | -0.000490 | -0.001988 | -0.002406 | -0.000677 | -0.005137 | -0.005620 |
| patient_id | -0.008796 | 1.000000 | -0.007137 | 0.005682 | -0.001963 | 0.002698 | 0.002230 | -0.001996 | -0.003230 | -0.000500 | ... | 0.001803 | -0.002055 | 0.001888 | 0.000227 | -0.001922 | 0.001189 | 0.000537 | 0.003104 | -0.003300 | 0.004425 |
| hospital_id | -0.004100 | -0.007137 | 1.000000 | -0.008687 | 0.011674 | 0.055223 | 0.028280 | 0.007623 | -0.004012 | 0.025116 | ... | 0.002424 | -0.004542 | 0.002583 | 0.011235 | 0.000676 | 0.000174 | -0.002734 | 0.002280 | -0.004989 | -0.001815 |
| age | -0.004463 | 0.005682 | -0.008687 | 1.000000 | -0.082475 | 0.067602 | -0.106102 | -0.026505 | 0.052547 | -0.121701 | ... | 0.076188 | -0.029720 | -0.028035 | 0.075190 | -0.019793 | 0.024915 | 0.029530 | 0.023281 | 0.025218 | 0.107402 |
| bmi | -0.000358 | -0.001963 | 0.011674 | -0.082475 | 1.000000 | 0.015102 | -0.054137 | -0.000627 | -0.000811 | 0.872744 | ... | -0.012799 | -0.020555 | -0.003107 | 0.169199 | -0.002008 | -0.029172 | -0.014263 | -0.010686 | -0.042228 | -0.030229 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| immunosuppression | -0.001988 | 0.001189 | 0.000174 | 0.024915 | -0.029172 | -0.014703 | 0.000566 | -0.030682 | 0.034976 | -0.027845 | ... | 0.025074 | 0.023742 | -0.003010 | -0.002744 | 0.003080 | 1.000000 | 0.136148 | 0.102447 | 0.269740 | 0.043906 |
| leukemia | -0.002406 | 0.000537 | -0.002734 | 0.029530 | -0.014263 | -0.017289 | 0.001822 | 0.001361 | 0.044404 | -0.013378 | ... | 0.030639 | -0.002502 | -0.005429 | 0.002006 | -0.001589 | 0.136148 | 1.000000 | 0.031854 | 0.005467 | 0.030132 |
| lymphoma | -0.000677 | 0.003104 | 0.002280 | 0.023281 | -0.010686 | -0.008722 | 0.008870 | -0.001340 | 0.015038 | -0.005878 | ... | 0.011958 | 0.021813 | 0.001536 | -0.002603 | 0.001692 | 0.102447 | 0.031854 | 1.000000 | 0.013808 | 0.019205 |
| solid_tumor_with_metastasis | -0.005137 | -0.003300 | -0.004989 | 0.025218 | -0.042228 | 0.015374 | 0.005183 | -0.013625 | 0.038971 | -0.038058 | ... | 0.026523 | -0.001617 | -0.005896 | -0.013155 | 0.007391 | 0.269740 | 0.005467 | 0.013808 | 1.000000 | 0.051014 |
| hospital_death | -0.005620 | 0.004425 | -0.001815 | 0.107402 | -0.030229 | -0.091322 | -0.019924 | 0.001573 | 0.065339 | -0.037726 | ... | 0.288236 | 0.004545 | 0.038526 | -0.015704 | 0.038288 | 0.043906 | 0.030132 | 0.019205 | 0.051014 | 1.000000 |
77 rows × 77 columns
# heatmap displaying correlation Matrix
sns.set(rc={'figure.figsize':(25,25)})
# show variables with correlation values greater 0.7
sns.heatmap(corrMatrix[corrMatrix > 0.7], annot=True)
plt.title("Correlation Matrix Plot")
Text(0.5, 1.0, 'Correlation Matrix Plot')
# dummy-code gender (F -> 1, M -> 0)
df['gender'] = df['gender'].apply(lambda x:1 if x == "F" else 0)
# one hot encode
df = pd.get_dummies(df)
df.shape
(88589, 121)
# Drop identifier columns that carry no predictive signal.
df = df.drop(["encounter_id", "patient_id"], axis=1)
# Split into two datasets, X with predictors, y with response
X, y = df.drop(['hospital_death'], axis = 1), df['hospital_death']
# Standardize predictors to zero mean / unit variance.
stscaler = StandardScaler().fit(X)
scaled = stscaler.transform(X)
# we select only 10 components
# NOTE(review): ``scaled`` is computed above but PCA is fitted on the raw,
# unscaled ``X`` — high-variance columns will dominate the components.
# This looks like it was meant to be ``pca.fit(scaled)``; confirm intent.
pca = PCA(n_components = 10)
pca.fit(X)
PCA(n_components=10)
fig, ax = plt.subplots(figsize=(10,6))
ax.set_xlabel('Dimension #')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Fraction of Explained Variance')
ax.plot(pca.explained_variance_ratio_)
[<matplotlib.lines.Line2D at 0x1c8af4aaeb0>]
# Build a loadings table: rows = original features, columns = components.
PCnames = ['PC'+str(i+1) for i in range(pca.n_components_)]
# Bug fix: ``pca.components_`` has shape (n_components, n_features), i.e.
# (10, 118) here. The original ``reshape(118, 10)`` reinterprets that buffer
# row-major and scrambles which loading belongs to which feature/component;
# the correct operation is a transpose. Transposing into a new variable also
# avoids mutating the fitted estimator in place and drops the hard-coded 118.
loadings_matrix = pca.components_.T
Loadings = pd.DataFrame(loadings_matrix, columns=PCnames, index=X.columns)
# select first 2 components
Loadings.iloc[:,:2]
| PC1 | PC2 | |
|---|---|---|
| hospital_id | 4.346235e-03 | -0.001999 |
| age | 9.966379e-01 | 0.000762 |
| bmi | -4.471325e-03 | -0.000004 |
| elective_surgery | -1.624285e-05 | -0.006279 |
| gender | 9.753775e-04 | 0.000190 |
| ... | ... | ... |
| apache_2_bodysystem_Renal/Genitourinary | -8.365639e-06 | -0.000215 |
| apache_2_bodysystem_Respiratory | 2.040913e-05 | -0.000032 |
| apache_2_bodysystem_Trauma | 3.116153e-07 | -0.000074 |
| apache_2_bodysystem_Undefined Diagnoses | 3.159674e-04 | 0.000002 |
| apache_2_bodysystem_Undefined diagnoses | -1.175792e-04 | 0.000316 |
118 rows × 2 columns
# pca 1 bar plot
Loadings["PC1"].sort_values().plot.barh()
<AxesSubplot:>
# pca 2 bar plot
Loadings["PC2"].sort_values().plot.barh()
<AxesSubplot:>
# select important features
X_pca = X[['age', 'apache_3j_bodysystem_Trauma', 'd1_sysbp_noninvasive_min', 'apache_2_bodysystem_Neurologic',
'apache_3j_bodysystem_Genitourinary', 'd1_spo2_min', 'd1_mbp_max', 'd1_temp_max',
'd1_glucose_max', 'h1_sysbp_noninvasive_max', 'icu_stay_type_readmit',
'icu_admit_source_Other ICU', 'h1_resprate_max',
'apache_3j_bodysystem_Cardiovascular', 'solid_tumor_with_metastasis', 'apache_2_bodysystem_Haematologic',
'ethnicity_Asian', 'icu_stay_type_admit', 'icu_stay_type_transfer',
'apache_2_bodysystem_Gastrointestinal',
'ethnicity_Caucasian', 'd1_heartrate_min',
'd1_potassium_max',
'd1_glucose_min', 'h1_mbp_min', 'h1_heartrate_min',
'apache_3j_bodysystem_Gynecological', 'lymphoma',
'ethnicity_African American', 'apache_2_bodysystem_Metabolic',
'icu_type_SICU', 'apache_2_bodysystem_Cardiovascular']]
# (leftover candidates excluded from X_pca, kept here for reference):
# 'd1_sysbp_noninvasive_max', 'd1_sysbp_max', 'h1_mbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'apache_3j_bodysystem_Gastrointestinal', 'h1_mbp_noninvasive_min', 'h1_mbp_max', 'd1_sysbp_min'
# number of selected features
len(X_pca.columns)
32
X_pca.columns
Index(['age', 'apache_3j_bodysystem_Trauma', 'd1_sysbp_noninvasive_min',
'apache_2_bodysystem_Neurologic', 'apache_3j_bodysystem_Genitourinary',
'd1_spo2_min', 'd1_mbp_max', 'd1_temp_max', 'd1_glucose_max',
'h1_sysbp_noninvasive_max', 'icu_stay_type_readmit',
'icu_admit_source_Other ICU', 'h1_resprate_max',
'apache_3j_bodysystem_Cardiovascular', 'solid_tumor_with_metastasis',
'apache_2_bodysystem_Haematologic', 'ethnicity_Asian',
'icu_stay_type_admit', 'icu_stay_type_transfer',
'apache_2_bodysystem_Gastrointestinal', 'ethnicity_Caucasian',
'd1_heartrate_min', 'd1_potassium_max', 'd1_glucose_min', 'h1_mbp_min',
'h1_heartrate_min', 'apache_3j_bodysystem_Gynecological', 'lymphoma',
'ethnicity_African American', 'apache_2_bodysystem_Metabolic',
'icu_type_SICU', 'apache_2_bodysystem_Cardiovascular'],
dtype='object')
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X_pca.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X_pca.values, i)
for i in range(len(X_pca.columns))]
vif_data = pd.DataFrame(vif_data)
vif_data
| feature | VIF | |
|---|---|---|
| 0 | age | 1.201421 |
| 1 | apache_3j_bodysystem_Trauma | 1.238756 |
| 2 | d1_sysbp_noninvasive_min | 1.612316 |
| 3 | apache_2_bodysystem_Neurologic | 1.686407 |
| 4 | apache_3j_bodysystem_Genitourinary | 1.129171 |
| 5 | d1_spo2_min | 1.116406 |
| 6 | d1_mbp_max | 1.486617 |
| 7 | d1_temp_max | 1.095657 |
| 8 | d1_glucose_max | 1.320305 |
| 9 | h1_sysbp_noninvasive_max | 1.958513 |
| 10 | icu_stay_type_readmit | 22.011346 |
| 11 | icu_admit_source_Other ICU | 1.015118 |
| 12 | h1_resprate_max | 1.137979 |
| 13 | apache_3j_bodysystem_Cardiovascular | 2.137271 |
| 14 | solid_tumor_with_metastasis | 1.009992 |
| 15 | apache_2_bodysystem_Haematologic | 1.037935 |
| 16 | ethnicity_Asian | 1.113664 |
| 17 | icu_stay_type_admit | 3254.250352 |
| 18 | icu_stay_type_transfer | 183.568043 |
| 19 | apache_2_bodysystem_Gastrointestinal | 1.463269 |
| 20 | ethnicity_Caucasian | 1.965351 |
| 21 | d1_heartrate_min | 1.905610 |
| 22 | d1_potassium_max | 1.092980 |
| 23 | d1_glucose_min | 1.230969 |
| 24 | h1_mbp_min | 1.997788 |
| 25 | h1_heartrate_min | 2.008103 |
| 26 | apache_3j_bodysystem_Gynecological | 1.025808 |
| 27 | lymphoma | 1.002761 |
| 28 | ethnicity_African American | 1.876258 |
| 29 | apache_2_bodysystem_Metabolic | 1.572000 |
| 30 | icu_type_SICU | 1.021476 |
| 31 | apache_2_bodysystem_Cardiovascular | 2.580895 |
# drop columns with VIF greater than 5
X_pca = X_pca.drop(["icu_stay_type_readmit","icu_stay_type_admit","icu_stay_type_transfer"], axis=1)
# correlation matrix to check collinearity
corrMatrix = X_pca.corr()
corrMatrix
| age | apache_3j_bodysystem_Trauma | d1_sysbp_noninvasive_min | apache_2_bodysystem_Neurologic | apache_3j_bodysystem_Genitourinary | d1_spo2_min | d1_mbp_max | d1_temp_max | d1_glucose_max | h1_sysbp_noninvasive_max | ... | d1_potassium_max | d1_glucose_min | h1_mbp_min | h1_heartrate_min | apache_3j_bodysystem_Gynecological | lymphoma | ethnicity_African American | apache_2_bodysystem_Metabolic | icu_type_SICU | apache_2_bodysystem_Cardiovascular | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.083086 | -0.058718 | -0.006263 | -0.012198 | -0.080593 | 0.006076 | -0.079255 | 0.010944 | 0.044158 | ... | 0.053787 | 0.058854 | -0.101066 | -0.157588 | -0.061493 | 0.023281 | -0.120158 | -0.282046 | 0.005675 | 0.153531 |
| apache_3j_bodysystem_Trauma | -0.083086 | 1.000000 | 0.035438 | -0.082700 | -0.033211 | 0.018482 | 0.007857 | 0.049672 | -0.056753 | 0.036333 | ... | -0.018782 | 0.004197 | 0.022712 | -0.020471 | -0.012538 | -0.008512 | -0.026504 | -0.064180 | 0.054231 | -0.183662 |
| d1_sysbp_noninvasive_min | -0.058718 | 0.035438 | 1.000000 | 0.140708 | -0.007654 | 0.223772 | 0.222383 | -0.103995 | -0.023085 | 0.440039 | ... | -0.123294 | 0.062948 | 0.541126 | -0.100796 | 0.015756 | -0.017099 | 0.069753 | 0.046253 | 0.016594 | -0.103678 |
| apache_2_bodysystem_Neurologic | -0.006263 | -0.082700 | 0.140708 | 1.000000 | -0.061554 | 0.057663 | 0.130923 | -0.000004 | -0.074708 | 0.168988 | ... | -0.102185 | -0.000538 | 0.145890 | -0.112793 | -0.023238 | -0.004278 | 0.007405 | -0.118953 | -0.023067 | -0.340403 |
| apache_3j_bodysystem_Genitourinary | -0.012198 | -0.033211 | -0.007654 | -0.061554 | 1.000000 | 0.004182 | -0.003460 | -0.018101 | -0.005209 | -0.015230 | ... | 0.089476 | -0.018829 | -0.013693 | -0.001605 | -0.009332 | 0.004615 | 0.019410 | -0.047770 | 0.000639 | -0.133134 |
| d1_spo2_min | -0.080593 | 0.018482 | 0.223772 | 0.057663 | 0.004182 | 1.000000 | -0.063674 | -0.041294 | -0.031939 | 0.031899 | ... | -0.095592 | -0.009620 | 0.133804 | -0.044671 | 0.017029 | -0.010100 | 0.020823 | 0.065084 | 0.029701 | -0.060372 |
| d1_mbp_max | 0.006076 | 0.007857 | 0.222383 | 0.130923 | -0.003460 | -0.063674 | 1.000000 | -0.005904 | 0.019472 | 0.524589 | ... | -0.041732 | 0.035141 | 0.432098 | -0.006792 | -0.006709 | -0.008267 | 0.090620 | 0.018409 | -0.009412 | -0.038879 |
| d1_temp_max | -0.079255 | 0.049672 | -0.103995 | -0.000004 | -0.018101 | -0.041294 | -0.005904 | 1.000000 | 0.001521 | -0.015762 | ... | -0.015775 | -0.024356 | -0.097278 | 0.193160 | -0.006705 | 0.003370 | -0.020140 | -0.040342 | 0.034177 | -0.042381 |
| d1_glucose_max | 0.010944 | -0.056753 | -0.023085 | -0.074708 | -0.005209 | -0.031939 | 0.019472 | 0.001521 | 1.000000 | 0.028522 | ... | 0.181871 | 0.385341 | -0.026445 | 0.122779 | -0.011640 | -0.002596 | 0.017783 | 0.165679 | -0.002528 | 0.007430 |
| h1_sysbp_noninvasive_max | 0.044158 | 0.036333 | 0.440039 | 0.168988 | -0.015230 | 0.031899 | 0.524589 | -0.015762 | 0.028522 | 1.000000 | ... | -0.064362 | 0.056745 | 0.588699 | -0.084990 | 0.003651 | -0.012975 | 0.065439 | 0.003390 | -0.002267 | -0.087443 |
| icu_admit_source_Other ICU | 0.006366 | -0.009104 | 0.001952 | 0.028700 | 0.000650 | -0.006031 | -0.000329 | 0.000873 | -0.004641 | -0.001049 | ... | 0.001197 | 0.005867 | -0.001199 | 0.007612 | -0.002526 | 0.003263 | -0.008371 | -0.004499 | -0.005990 | -0.005279 |
| h1_resprate_max | 0.025842 | -0.041530 | -0.089148 | -0.060036 | -0.003420 | -0.132751 | 0.122098 | 0.084161 | 0.052893 | 0.071110 | ... | 0.002208 | 0.053916 | 0.001255 | 0.229163 | -0.018846 | 0.018809 | 0.008538 | -0.027927 | -0.020725 | 0.060249 |
| apache_3j_bodysystem_Cardiovascular | 0.139444 | -0.148908 | 0.018319 | -0.275989 | -0.110833 | -0.006021 | -0.023614 | -0.110655 | -0.004390 | 0.020282 | ... | 0.075040 | -0.020423 | 0.016865 | -0.183195 | -0.041841 | -0.014907 | -0.005892 | -0.214186 | -0.012803 | 0.670963 |
| solid_tumor_with_metastasis | 0.025218 | -0.023760 | -0.022421 | 0.007202 | 0.005420 | -0.011296 | -0.021230 | -0.007725 | -0.013073 | -0.024187 | ... | -0.002199 | 0.010030 | -0.013302 | 0.046457 | 0.003377 | 0.013808 | -0.002292 | -0.027244 | 0.011504 | -0.009880 |
| apache_2_bodysystem_Haematologic | -0.005900 | -0.017948 | 0.000053 | -0.033266 | -0.013359 | -0.001647 | -0.017183 | 0.012919 | -0.021093 | -0.019614 | ... | -0.007015 | -0.004090 | -0.018121 | 0.019588 | -0.005043 | 0.021426 | 0.019097 | -0.025816 | -0.000311 | -0.073878 |
| ethnicity_Asian | -0.006110 | -0.005765 | 0.000813 | 0.008724 | 0.006064 | 0.015763 | -0.002037 | 0.004016 | 0.002637 | -0.003855 | ... | -0.006614 | 0.001342 | 0.004361 | 0.001861 | 0.008736 | 0.002109 | -0.038762 | -0.006918 | -0.004786 | -0.003836 |
| apache_2_bodysystem_Gastrointestinal | 0.049390 | -0.070553 | -0.015817 | -0.130765 | -0.052513 | 0.004971 | -0.037433 | -0.005885 | -0.040026 | -0.026725 | ... | -0.008857 | 0.003070 | -0.032585 | 0.050883 | -0.019825 | -0.000718 | -0.015248 | -0.101482 | 0.070227 | -0.290406 |
| ethnicity_Caucasian | 0.136964 | 0.022876 | -0.063920 | -0.011448 | -0.021611 | -0.061182 | -0.036930 | 0.013379 | -0.035791 | -0.034059 | ... | 0.002556 | 0.015562 | -0.082877 | -0.038769 | -0.026533 | 0.012159 | -0.652078 | -0.017559 | 0.003157 | 0.011031 |
| d1_heartrate_min | -0.139413 | -0.004945 | 0.030813 | -0.096701 | 0.009956 | 0.084661 | -0.070805 | 0.176153 | 0.092650 | -0.106125 | ... | -0.009394 | 0.052251 | 0.014991 | 0.659988 | 0.011300 | 0.011773 | 0.039267 | 0.048460 | 0.019380 | -0.061368 |
| d1_potassium_max | 0.053787 | -0.018782 | -0.123294 | -0.102185 | 0.089476 | -0.095592 | -0.041732 | -0.015775 | 0.181871 | -0.064362 | ... | 1.000000 | -0.000767 | -0.120822 | -0.002402 | 0.006258 | 0.002796 | 0.010195 | -0.028960 | 0.017130 | 0.030145 |
| d1_glucose_min | 0.058854 | 0.004197 | 0.062948 | -0.000538 | -0.018829 | -0.009620 | 0.035141 | -0.024356 | 0.385341 | 0.056745 | ... | -0.000767 | 1.000000 | 0.045512 | 0.062805 | 0.001179 | -0.000031 | -0.014529 | -0.073451 | -0.001394 | 0.008358 |
| h1_mbp_min | -0.101066 | 0.022712 | 0.541126 | 0.145890 | -0.013693 | 0.133804 | 0.432098 | -0.097278 | -0.026445 | 0.588699 | ... | -0.120822 | 0.045512 | 1.000000 | 0.023219 | 0.009444 | -0.011890 | 0.108169 | 0.056627 | 0.032803 | -0.081305 |
| h1_heartrate_min | -0.157588 | -0.020471 | -0.100796 | -0.112793 | -0.001605 | -0.044671 | -0.006792 | 0.193160 | 0.122779 | -0.084990 | ... | -0.002402 | 0.062805 | 0.023219 | 1.000000 | -0.004277 | 0.017476 | 0.030658 | 0.063587 | 0.001699 | -0.030799 |
| apache_3j_bodysystem_Gynecological | -0.061493 | -0.012538 | 0.015756 | -0.023238 | -0.009332 | 0.017029 | -0.006709 | -0.006705 | -0.011640 | 0.003651 | ... | 0.006258 | 0.001179 | 0.009444 | -0.004277 | 1.000000 | -0.003849 | 0.017460 | -0.018034 | 0.023850 | -0.051607 |
| lymphoma | 0.023281 | -0.008512 | -0.017099 | -0.004278 | 0.004615 | -0.010100 | -0.008267 | 0.003370 | -0.002596 | -0.012975 | ... | 0.002796 | -0.000031 | -0.011890 | 0.017476 | -0.003849 | 1.000000 | -0.010356 | -0.012163 | 0.001239 | 0.008218 |
| ethnicity_African American | -0.120158 | -0.026504 | 0.069753 | 0.007405 | 0.019410 | 0.020823 | 0.090620 | -0.020140 | 0.017783 | 0.065439 | ... | 0.010195 | -0.014529 | 0.108169 | 0.030658 | 0.017460 | -0.010356 | 1.000000 | 0.015392 | 0.028398 | -0.002473 |
| apache_2_bodysystem_Metabolic | -0.282046 | -0.064180 | 0.046253 | -0.118953 | -0.047770 | 0.065084 | 0.018409 | -0.040342 | 0.165679 | 0.003390 | ... | -0.028960 | -0.073451 | 0.056627 | 0.063587 | -0.018034 | -0.012163 | 0.015392 | 1.000000 | -0.038833 | -0.264175 |
| icu_type_SICU | 0.005675 | 0.054231 | 0.016594 | -0.023067 | 0.000639 | 0.029701 | -0.009412 | 0.034177 | -0.002528 | -0.002267 | ... | 0.017130 | -0.001394 | 0.032803 | 0.001699 | 0.023850 | 0.001239 | 0.028398 | -0.038833 | 1.000000 | -0.052545 |
| apache_2_bodysystem_Cardiovascular | 0.153531 | -0.183662 | -0.103678 | -0.340403 | -0.133134 | -0.060372 | -0.038879 | -0.042381 | 0.007430 | -0.087443 | ... | 0.030145 | 0.008358 | -0.081305 | -0.030799 | -0.051607 | 0.008218 | -0.002473 | -0.264175 | -0.052545 | 1.000000 |
29 rows × 29 columns
# --- Correlation heatmaps -------------------------------------------------
# heatmap displaying correlation Matrix
sns.set(rc={'figure.figsize':(20,20)})
# show variables with correlation values greater 0.7
# NOTE(review): the boolean mask keeps the diagonal (self-correlation == 1.0),
# so every variable lights up against itself; mask the diagonal if only
# cross-correlations are of interest.
sns.heatmap(corrMatrix[corrMatrix > 0.7],annot=True)
<AxesSubplot:>
# same plot for strong negative correlations (values below -0.7)
sns.heatmap(corrMatrix[corrMatrix < -0.7],annot=True)
<AxesSubplot:>
# --- Assemble modeling frame and initial train/test split -----------------
df_full = pd.concat([X_pca,y], axis=1)
df_full.shape
(88589, 30)
# Split into two datasets, one with predictors, one with response
X, y = df_full.drop(['hospital_death'], axis = 1), df_full['hospital_death']
#SMOTE before split
# (left disabled on purpose: oversampling before the train/test split would
# leak synthetic copies of test rows into the training data)
#sm = SMOTE(random_state=12)
#X, y = sm.fit_resample(X, y)
# Split PCA data into training and test
# stratify=y keeps the death/survival ratio identical in both partitions
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 12, stratify=y)
# NOTE(review): `y` is rebound here from the full target to the *training*
# target wrapped in a DataFrame — a fresh name would be clearer.
y = pd.DataFrame(y_train)
# density plot of hospital_death
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# sns.histplot(..., kde=True) is the modern equivalent.
sns.set(rc={'figure.figsize':(10,6)})
sns.distplot(y['hospital_death'])
<AxesSubplot:xlabel='hospital_death', ylabel='Density'>
# class counts before balancing: heavy imbalance (64737 vs 6134)
y.hospital_death.value_counts()
0 64737 1 6134 Name: hospital_death, dtype: int64
# --- Balance the training data with SMOTE ---------------------------------
# Apply SMOTE to X in training set (only the training partition is
# oversampled; X_test/y_test stay untouched)
sm = SMOTE(random_state=12)
X_train, y_train= sm.fit_resample(X_train, y_train)
# Split training data into training and test
# NOTE(review): this inner split is taken *after* SMOTE, so test_X/test_y
# contain synthetic samples and metrics computed on them are optimistic.
# X_test/y_test (carved out before SMOTE) remain the honest hold-out.
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, train_size = 0.8, test_size = 0.2, random_state = 12, stratify=y_train)
# convert y to pd data frame (rebinds `y` to the balanced training target)
y = pd.DataFrame(y_train)
# density plot of hospital_death after balancing
# (distplot is deprecated; histplot(..., kde=True) is the modern call)
sns.set(rc={'figure.figsize':(10,6)})
sns.distplot(y['hospital_death'])
<AxesSubplot:xlabel='hospital_death', ylabel='Density'>
# count in training dataset after smote: classes now perfectly balanced
y.value_counts()
hospital_death 0 64737 1 64737 dtype: int64
# --- Random Forest --------------------------------------------------------
# cross validation on training data
# NOTE(review): X_train/y_train were oversampled with SMOTE *before* this CV,
# so synthetic points interpolated from fold-mates leak across folds and the
# CV accuracy is optimistic; running SMOTE inside each fold (imblearn
# Pipeline) would give an honest estimate.
rf = RandomForestClassifier()
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy')
# cross validation accuracy (mean over the 10 folds)
cv_rf=np.mean(rf_scores)
cv_rf
0.935486131775708
# compute training accuracy on the inner (post-SMOTE) hold-out split
# NOTE(review): accuracy_score's signature is (y_true, y_pred); the arguments
# are swapped here, which happens to be harmless because accuracy is symmetric.
rf.fit(train_X, train_y)
y_pred = rf.predict(test_X)
tr_rf_accuracy_metric = accuracy_score(y_pred, test_y)
tr_rf_accuracy_metric
0.9329986483877196
## compute test accuracy on the untouched (non-SMOTE) hold-out set
te_rf_accuracy_metric=rf.score(X_test, y_test)
te_rf_accuracy_metric
0.879218873462016
from sklearn.metrics import f1_score
# NOTE(review): micro-averaged F1 on a binary task equals plain accuracy, so
# this duplicates tr_rf_accuracy_metric; average='binary' would report the F1
# of the positive (death) class instead.
rf_f1 = f1_score(test_y, y_pred, average='micro')
rf_f1
0.9329986483877196
# calculating precision and recall on the inner hold-out split
precision_rf = precision_score(test_y, y_pred)
recall_rf = recall_score(test_y, y_pred)
print('Precision rf: ',precision_rf)
print('Recall rf: ',recall_rf)
Precision rf: 0.930838392376854 Recall rf: 0.9355112758727216
# share of total importance carried by the five strongest features
rf_imp = rf.feature_importances_
np.sort(rf_imp)[-5:].sum()
0.3827385395915027
# Top 20 most important features of this model
sorted_idx = rf.feature_importances_.argsort()[-20:]
plt.figure(figsize=(5,5))
plt.barh(train_X.columns[sorted_idx], rf.feature_importances_[sorted_idx])
plt.suptitle('Important Variables From Random Forest')
#plt.savefig('rf_importance.png')
Text(0.5, 0.98, 'Important Variables From Random Forest')
# confusion matrix
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement.
figure(figsize=(80, 60), dpi=80)
plot_confusion_matrix(rf, test_X, test_y)
plt.show()
<Figure size 6400x4800 with 0 Axes>
# precision and recall curve on the real test set
# predict probabilities (the lr_ prefix is reused for every model; here these
# are the random-forest probabilities, not logistic-regression ones)
lr_probs = rf.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = rf.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# summarize scores
print('Random Forest: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# plot the precision-recall curves
figure(figsize=(6, 4), dpi=80)
# no-skill baseline = prevalence of the positive class in y_test
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Random Forest')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Random Forest: f1=0.322 auc=0.288
# --- Decision Tree --------------------------------------------------------
# cross validation on post-PCA training data
# NOTE(review): CV runs on SMOTE-oversampled data, so synthetic neighbors
# leak across folds and the score is optimistic.
dtree = DecisionTreeClassifier()
dtree_scores_pca = cross_val_score(dtree, X_train, y_train, cv=10, scoring='accuracy')
# cross validation accuracy (mean over the 10 folds)
cv_dt=np.mean(dtree_scores_pca)
cv_dt
0.8774745320499278
# compute training accuracy on the inner (post-SMOTE) hold-out split
# (accuracy_score args are (y_true, y_pred); swapped here, harmless because
# accuracy is symmetric)
d_fit = dtree.fit(train_X, train_y)
y_pred = dtree.predict(test_X)
tr_dt_accuracy_metric = accuracy_score(y_pred, test_y)
tr_dt_accuracy_metric
0.8751110252944584
## compute test accuracy on the untouched (non-SMOTE) hold-out set
te_dt_accuracy_metric=dtree.score(X_test, y_test)
te_dt_accuracy_metric
0.8196749068743651
# micro-averaged F1 on a binary task equals accuracy (duplicates the value above)
dtree_f1 = f1_score(test_y, y_pred, average='micro')
dtree_f1
0.8751110252944584
# calculating precision and recall on the inner hold-out split
precision_dtree = precision_score(test_y, y_pred)
recall_dtree = recall_score(test_y, y_pred)
print('Precision dtree: ',precision_dtree)
print('Recall dtree: ',recall_dtree)
Precision dtree: 0.8637657279808268 Recall dtree: 0.8907167130058696
# tree plot (disabled: very slow/large for a deep tree)
#fig = plt.figure(figsize=(25,20))
#plot_tree(dtree.fit(train_X, train_y), filled=True)
#plt.show()
# confusion matrix (plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement)
figure(figsize=(80, 60), dpi=80)
plot_confusion_matrix(dtree, test_X, test_y)
plt.show()
<Figure size 6400x4800 with 0 Axes>
# precision and recall curve on the real test set
# predict probabilities (the lr_ prefix is historical; these are the
# decision-tree probabilities)
lr_probs = dtree.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = dtree.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# summarize scores
print('Decision Tree: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# plot the precision-recall curves
figure(figsize=(6, 4), dpi=80)
# no-skill baseline = prevalence of the positive class in y_test
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Decision Tree')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Decision Tree: f1=0.255 auc=0.306
# --- Logistic Regression --------------------------------------------------
# cross validation on post-PCA training data
# NOTE(review): CV runs on SMOTE-oversampled data, so synthetic neighbors
# leak across folds and the score is optimistic.
logit = LogisticRegression()
logit_scores_pca = cross_val_score(logit, X_train, y_train, cv=10, scoring='accuracy')
# cross validation accuracy (mean over the 10 folds)
cv_lr=np.mean(logit_scores_pca)
cv_lr
0.7444276526822514
# compute training accuracy on the inner (post-SMOTE) hold-out split
# (accuracy_score args are (y_true, y_pred); swapped here, harmless because
# accuracy is symmetric)
logit.fit(train_X, train_y)
y_pred_pca = logit.predict(test_X)
tr_lr_accuracy_metric = accuracy_score(y_pred_pca, test_y)
tr_lr_accuracy_metric
0.7483297933964086
## compute test accuracy on the untouched (non-SMOTE) hold-out set
te_lr_accuracy_metric=logit.score(X_test, y_test)
te_lr_accuracy_metric
0.729879218873462
# micro-averaged F1 on a binary task equals accuracy (duplicates the value above)
logit_f1 = f1_score(test_y, y_pred_pca, average='micro')
logit_f1
0.7483297933964086
# calculating precision and recall on the inner hold-out split
precision_logit = precision_score(test_y, y_pred_pca)
recall_logit = recall_score(test_y, y_pred_pca)
print('Precision logit: ',precision_logit)
print('Recall logit: ',recall_logit)
Precision logit: 0.7474413235859946 Recall logit: 0.7501544640098857
# confusion matrix (plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement)
figure(figsize=(80, 60), dpi=80)
plot_confusion_matrix(logit, test_X, test_y)
plt.show()
<Figure size 6400x4800 with 0 Axes>
# precision and recall curve on the real test set
# predict probabilities
lr_probs = logit.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = logit.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# summarize scores
print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# plot the precision-recall curves
figure(figsize=(6, 4), dpi=80)
# no-skill baseline = prevalence of the positive class in y_test
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic Regression')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Logistic: f1=0.278 auc=0.267
# --- K-Nearest Neighbors --------------------------------------------------
# cross validation on post-PCA training data
# NOTE(review): CV runs on SMOTE-oversampled data, so synthetic neighbors
# leak across folds and the score is optimistic — KNN is the model most
# affected, since it retrieves those near-duplicate points directly.
knn = KNeighborsClassifier()
knn_scores_pca = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
# cross validation accuracy (mean over the 10 folds)
cv_knn=np.mean(knn_scores_pca)
cv_knn
0.8801766930118058
# compute training accuracy on the inner (post-SMOTE) hold-out split
# (accuracy_score args are (y_true, y_pred); swapped here, harmless because
# accuracy is symmetric)
k_fit = knn.fit(train_X, train_y)
y_pred = knn.predict(test_X)
tr_knn_accuracy_metric = accuracy_score(y_pred, test_y)
tr_knn_accuracy_metric
0.8746476153697625
# accuracy of our model on the untouched (non-SMOTE) test data
te_knn_accuracy_metric=knn.score(X_test, y_test)
te_knn_accuracy_metric
0.7294277006434134
# micro-averaged F1 on a binary task equals accuracy (duplicates the value above)
knn_f1 = f1_score(test_y, y_pred, average='micro')
knn_f1
0.8746476153697625
# calculating precision and recall on the inner hold-out split
# NOTE(review): recall ~0.998 against a much lower honest test accuracy
# (0.729) looks like the classic SMOTE-leakage symptom — synthetic
# near-duplicates ended up on both sides of the inner split.
precision_knn = precision_score(test_y, y_pred)
recall_knn = recall_score(test_y, y_pred)
print('Precision KNN: ',precision_knn)
print('Recall KNN: ',recall_knn)
Precision KNN: 0.8004087193460491 Recall KNN: 0.9982236638863144
# confusion matrix (plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement)
figure(figsize=(80, 60), dpi=80)
plot_confusion_matrix(knn, test_X, test_y)
plt.show()
<Figure size 6400x4800 with 0 Axes>
# precision and recall curve on the real test set
# predict probabilities
lr_probs = knn.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = knn.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# summarize scores
print('KNN: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# plot the precision-recall curves
figure(figsize=(6, 4), dpi=80)
# no-skill baseline = prevalence of the positive class in y_test
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='KNN')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
KNN: f1=0.249 auc=0.260
## We combined all metrics we have and create a table for model comparison
# Build the summary row-wise: one tuple per classifier, in the fixed order
# (Random Forest, Decision Tree, Logistic Regression, KNN), then sort the
# table best-to-worst on training accuracy for display.
_model_names = ['Random Forest', 'Decision Tree', 'Logistic Regression', 'KNN']
_metric_rows = zip(
    _model_names,
    [cv_rf, cv_dt, cv_lr, cv_knn],
    [tr_rf_accuracy_metric, tr_dt_accuracy_metric,
     tr_lr_accuracy_metric, tr_knn_accuracy_metric],
    [te_rf_accuracy_metric, te_dt_accuracy_metric,
     te_lr_accuracy_metric, te_knn_accuracy_metric],
    [rf_f1, dtree_f1, logit_f1, knn_f1],
    [precision_rf, precision_dtree, precision_logit, precision_knn],
    [recall_rf, recall_dtree, recall_logit, recall_knn],
)
models = pd.DataFrame(
    _metric_rows,
    columns=['Model', '10-Fold CV Accuracy', 'Train Accuracy',
             'Test Accuracy', 'F1-Score', 'Precision', 'Recall'],
)
models.sort_values(by='Train Accuracy', ascending=False)
| Model | 10-Fold CV Accuracy | Train Accuracy | Test Accuracy | F1-Score | Precision | Recall | |
|---|---|---|---|---|---|---|---|
| 0 | Random Forest | 0.935486 | 0.932999 | 0.879219 | 0.932999 | 0.930838 | 0.935511 |
| 1 | Decision Tree | 0.877475 | 0.875111 | 0.819675 | 0.875111 | 0.863766 | 0.890717 |
| 3 | KNN | 0.880177 | 0.874648 | 0.729428 | 0.874648 | 0.800409 | 0.998224 |
| 2 | Logistic Regression | 0.744428 | 0.748330 | 0.729879 | 0.748330 | 0.747441 | 0.750154 |